library(RPostgreSQL)
## Loading required package: DBI
library(ggplot2)
library(tidyverse)
<<<<<<< HEAD
## ── Attaching packages ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
=======
## ── Attaching packages ────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
>>>>>>> ddb2e9d9d49f7fe95434c9e1a6b18bdd93fce00c
## ✔ tibble  1.4.2     ✔ purrr   0.2.4
## ✔ tidyr   0.8.0     ✔ dplyr   0.7.4
## ✔ readr   1.1.1     ✔ stringr 1.3.0
## ✔ tibble  1.4.2     ✔ forcats 0.3.0
<<<<<<< HEAD
## ── Conflicts ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
=======
## ── Conflicts ───────────────────────────────────────────────────────────── tidyverse_conflicts() ──
>>>>>>> ddb2e9d9d49f7fe95434c9e1a6b18bdd93fce00c
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(ggthemes)
library(ggcorrplot)
library(purrr)

# load the PostgreSQL driver
drv <- dbDriver("PostgreSQL")

# create a connection to the postgres database
# set the search path to the mimiciii schema
con <- dbConnect(drv, dbname = "mimic",
                 host = "localhost", port = 5432,
                 user = "DavidSasson")
dbSendQuery(con, 'set search_path to mimiciii')
## <PostgreSQLResult>
# test connection
# dbExistsTable(con, "patients")

# show a list of tables
dbListTables(con)
##  [1] "chartevents_1"      "chartevents_2"      "chartevents_3"     
##  [4] "chartevents_4"      "chartevents_5"      "chartevents_6"     
##  [7] "chartevents_7"      "chartevents_8"      "chartevents_9"     
## [10] "chartevents_10"     "chartevents_11"     "chartevents_12"    
## [13] "chartevents_13"     "chartevents_14"     "chartevents_15"    
## [16] "chartevents"        "admissions"         "callout"           
## [19] "caregivers"         "chartevents_16"     "chartevents_17"    
## [22] "cptevents"          "datetimeevents"     "diagnoses_icd"     
## [25] "drgcodes"           "d_cpt"              "d_icd_diagnoses"   
## [28] "d_icd_procedures"   "d_items"            "d_labitems"        
## [31] "icustays"           "inputevents_cv"     "inputevents_mv"    
## [34] "labevents"          "microbiologyevents" "noteevents"        
## [37] "outputevents"       "patients"           "prescriptions"     
## [40] "procedureevents_mv" "procedures_icd"     "services"          
## [43] "transfers"

This snippet makes several assumptions with respect to the way the database is set up.

  • dbname: It assumes the name of your database is the same as your local username (i.e., the result of whoami). If this is not the case, you should pass a different value indicating the name of your database (e.g., dbname='mimic').
  • host: It assumes the database is hosted locally on your machine (i.e., on localhost).
  • port: It assumes Postgres is listening on the default port, 5432.
  • user: It assumes the current user has access to the database.
  • password: It assumes no password is required for this user, usually because Postgres is using peer authentication.

Data Exploration with ggplot2

admins = dbGetQuery(con, "select * from admissions")

str(admins)
## 'data.frame':    58976 obs. of  19 variables:
##  $ row_id              : int  21 22 23 24 25 26 27 28 29 30 ...
##  $ subject_id          : int  22 23 23 24 25 26 27 28 30 31 ...
##  $ hadm_id             : int  165315 152223 124321 161859 129635 197661 134931 162569 104557 128652 ...
##  $ admittime           : POSIXct, format: "2196-04-09 12:26:00" "2153-09-03 07:15:00" ...
##  $ dischtime           : POSIXct, format: "2196-04-10 15:54:00" "2153-09-08 19:10:00" ...
##  $ deathtime           : POSIXct, format: NA NA ...
##  $ admission_type      : chr  "EMERGENCY" "ELECTIVE" "EMERGENCY" "EMERGENCY" ...
##  $ admission_location  : chr  "EMERGENCY ROOM ADMIT" "PHYS REFERRAL/NORMAL DELI" "TRANSFER FROM HOSP/EXTRAM" "TRANSFER FROM HOSP/EXTRAM" ...
##  $ discharge_location  : chr  "DISC-TRAN CANCER/CHLDRN H" "HOME HEALTH CARE" "HOME HEALTH CARE" "HOME" ...
##  $ insurance           : chr  "Private" "Medicare" "Medicare" "Private" ...
##  $ language            : chr  NA NA "ENGL" NA ...
##  $ religion            : chr  "UNOBTAINABLE" "CATHOLIC" "CATHOLIC" "PROTESTANT QUAKER" ...
##  $ marital_status      : chr  "MARRIED" "MARRIED" "MARRIED" "SINGLE" ...
##  $ ethnicity           : chr  "WHITE" "WHITE" "WHITE" "WHITE" ...
##  $ edregtime           : POSIXct, format: "2196-04-09 10:06:00" NA ...
##  $ edouttime           : POSIXct, format: "2196-04-09 13:24:00" NA ...
##  $ diagnosis           : chr  "BENZODIAZEPINE OVERDOSE" "CORONARY ARTERY DISEASE\\CORONARY ARTERY BYPASS GRAFT/SDA" "BRAIN MASS" "INTERIOR MYOCARDIAL INFARCTION" ...
##  $ hospital_expire_flag: int  0 0 0 0 0 0 0 0 0 1 ...
##  $ has_chartevents_data: int  1 1 1 1 1 1 1 1 1 1 ...
x <- ggplot(admins)

x + geom_bar(aes(religion)) + coord_flip() + theme_minimal()

x + geom_bar(aes(insurance)) + facet_wrap(~ admission_type) + theme_fivethirtyeight()

x + geom_bar(aes(ethnicity)) + coord_flip() + theme_tufte()

x + geom_count(aes(x = insurance, y = ethnicity), alpha=0.3, col="red") + 
  scale_size_area() +
  theme_classic() 

demo.los <- as.numeric(admins$dischtime - admins$admittime)

ggplot() + 
  geom_histogram(aes(demo.los), alpha=0.6, fill=4, bins = 30) + 
  ggtitle("Length of Stay in ICU") 

# keep only numeric columns 
nums <- admins %>% keep(is.numeric)

ggcorrplot(cor(nums))

pairs(nums, pch =19, lower.panel=panel.smooth, cex = 0.1, upper.panel = NULL)

Full disclosure–I put together this presentaion on my flight over here yesterday :) below is some scratch that didn’t make it into the pretty graphs above. I’m including it to show you what my thought process is and what real data exploration looks like. It sure as hell ain’t coherent, but you’ll always end up with an interesting answer if you keep digging through the data. Bonus points to whoever makes cool vizzes with these Bonus points mean nothing, but I will really like you as a person

# look at time difference. why is this straight line? Maybe look at boxplot of diffs for each subject
# x + geom_point(aes(admittime, deathtime), alpha=0.3)

# This takes too long but should be investigated. ~ 15,000 diagnosis codes?
# need to find which 15 are most common and shop
# x + geom_bar(aes(diagnosis)) + coord_flip()
# 
# unique(admins$diagnosis)
# [1000] "CORONARY ARTERY DISEASE\\CORONARY ARTERY BYPASS GRAFT; ? OFF PUMP/SDA"                              
# [ reached getOption("max.print") -- omitted 14692 entries ]
dbListTables(con)
##  [1] "chartevents_1"      "chartevents_2"      "chartevents_3"     
##  [4] "chartevents_4"      "chartevents_5"      "chartevents_6"     
##  [7] "chartevents_7"      "chartevents_8"      "chartevents_9"     
## [10] "chartevents_10"     "chartevents_11"     "chartevents_12"    
## [13] "chartevents_13"     "chartevents_14"     "chartevents_15"    
## [16] "chartevents"        "admissions"         "callout"           
## [19] "caregivers"         "chartevents_16"     "chartevents_17"    
## [22] "cptevents"          "datetimeevents"     "diagnoses_icd"     
## [25] "drgcodes"           "d_cpt"              "d_icd_diagnoses"   
## [28] "d_icd_procedures"   "d_items"            "d_labitems"        
## [31] "icustays"           "inputevents_cv"     "inputevents_mv"    
## [34] "labevents"          "microbiologyevents" "noteevents"        
## [37] "outputevents"       "patients"           "prescriptions"     
## [40] "procedureevents_mv" "procedures_icd"     "services"          
## [43] "transfers"
stays = dbGetQuery(con, "select * from icustays")

str(stays)
## 'data.frame':    61532 obs. of  12 variables:
##  $ row_id        : int  365 366 367 368 369 370 371 372 373 374 ...
##  $ subject_id    : int  268 269 270 271 272 273 274 275 276 277 ...
##  $ hadm_id       : int  110404 106296 188028 173727 164716 158689 130546 129886 135156 171601 ...
##  $ icustay_id    : int  280836 206613 220345 249196 210407 241507 254851 219649 206327 272866 ...
##  $ dbsource      : chr  "carevue" "carevue" "carevue" "carevue" ...
##  $ first_careunit: chr  "MICU" "MICU" "CCU" "MICU" ...
##  $ last_careunit : chr  "MICU" "MICU" "CCU" "SICU" ...
##  $ first_wardid  : int  52 52 57 52 57 52 12 7 57 56 ...
##  $ last_wardid   : int  52 52 57 23 57 52 12 7 57 56 ...
##  $ intime        : POSIXct, format: "2198-02-14 23:27:38" "2170-11-05 11:05:29" ...
##  $ outtime       : POSIXct, format: "2198-02-18 05:26:11" "2170-11-08 17:46:57" ...
##  $ los           : num  3.25 3.28 2.89 2.06 1.62 ...
t <- ggplot(stays)

# keep only numeric columns 
nums <- stays %>% keep(is.numeric)

ggcorrplot(cor(nums))

# plot histogram of length of stay in the ICU
t + geom_histogram(aes(los), binwidth = 1, fill=I("#9ebcda"), col=I("#FFFFFF")) + 
  xlim(c(0,20)) + 
  ggtitle("Length of stay in the ICU") + 
  xlab("Length of stay, days")

Optional Break

Anything you guys are intersted in exploring in MIMIC? If so, let’s do it together! I promise you’ll see me make mistakes every step of the way, but I think we have enough brainpower in this room to find something cool.

dbListTables(con)
##  [1] "chartevents_1"      "chartevents_2"      "chartevents_3"     
##  [4] "chartevents_4"      "chartevents_5"      "chartevents_6"     
##  [7] "chartevents_7"      "chartevents_8"      "chartevents_9"     
## [10] "chartevents_10"     "chartevents_11"     "chartevents_12"    
## [13] "chartevents_13"     "chartevents_14"     "chartevents_15"    
## [16] "chartevents"        "admissions"         "callout"           
## [19] "caregivers"         "chartevents_16"     "chartevents_17"    
## [22] "cptevents"          "datetimeevents"     "diagnoses_icd"     
## [25] "drgcodes"           "d_cpt"              "d_icd_diagnoses"   
## [28] "d_icd_procedures"   "d_items"            "d_labitems"        
## [31] "icustays"           "inputevents_cv"     "inputevents_mv"    
## [34] "labevents"          "microbiologyevents" "noteevents"        
## [37] "outputevents"       "patients"           "prescriptions"     
## [40] "procedureevents_mv" "procedures_icd"     "services"          
## [43] "transfers"

Sidenote: this workshop was adapted from the HST953 course. If you’re interested in this type of stuff and are a big nerd (~like me~) you should defeintly enroll in the upcoming fall course. More info at criticaldata.mit.edu


In this workshop, we will explore the following visualization topics:

  • Unsupervised Visualizations: This includes histograms, scatterplots and boxplots
  • Supervised Visualizations: This includes model fitting and checking
  • Other Considerations: How to use color, size, labels, etc.

Note that much of this content has been taken from the work of Jesse Raffa and Jeffrey Heer, but data visualization for clinical purposes is not new! Florence Nightangle drew coxcomb charts to demonstrate the impact of disease on troop mortality in 1858, and John Snow famously pinpointed the 1854 outbreak of cholera in London to a public water pump by mapping deaths in relation to pumps.

Prerequisites

Let’s start by loading some data: the arterial line study data and the MIMIC-III Demo data.

The arterial line study looks at the impact of arterial line catheters on mortality in the ICU. The study was originally performed in MIMIC-II, but has been extended to MIMIC-III. The data is available on PhysioNet and can be downloaded from https://physionet.org/works/MIMICIIIarteriallinedatasetHST953/. Subsequently, this can be loaded into a dataframe called aline:

aline <- read.csv("~/Desktop/MIT/Data Viz Workshop/aline-dataset.csv")

Unsupervised Visualizations

Unsupervised data visualization is a fundamentally exploratory process. We often 1) construct graphics to address specific questions, 2) inspect the “answer”, and 3) assess new questions. This can happen many times, especially if you want to demonstrate data variation.

Common Data Transformations

Another important thing to consider is the data you’re working with - you may need to transform data appropriately to help with comparisons, or better approximate a normal distribution. Some common transforms that arise in practice are:

Transform Operation Common Use
Normalization \((x - mean(x)) / std(x)\) Convenience of zero-mean, unit variance.
Reciprocal \(1 / x\) Reversing order among values of same sign (i.e. largest becomes smallest).
Log \(log(x)\) Reducing right skewedness.
Cube Root \(x^{1/3}\) Reducing right skewedness. Can be applied to zero and negative values.
Square \(x^2\) Reducing left skewedness.
Box-Cox \((x^\lambda - 1)/ \lambda , \lambda \neq 0\), \(log(x) ,\lambda = 0\) Obtain a “normal” shape.

While these are common statistical transforms, they are by no means exhaustive. Many transforms arise in the form of preprocessing. These can include binning (e.g., as a means of discretizing continuous data), and grouping (e.g., merging categorical values that share a single semantic meaning).

There is no exercise for this section.


Histogram

The histogram characterizes the distribution of a variable by plotting the frequency that a numeric variable occurs within intervals called bins. For example, we plot the length of stay in the ICU below as a histogram using the hist function. R chooses it’s bin size adaptively, but sometimes it doesn’t make a good choice. You can change the number of bins by specifying the breaks argument. The breaks are related to the number of bins in the plot. Increasing the breaks to 100, yields the second plot with. It’s also common to transform the data, and you can see below, for a long-tailed variable like length of stay, taking the log reduces the “tailedness” of the distribution. hist and R plots in general are very customizable, and handle lots of additional arguments. We have included a couple here.

hist(aline$icu_los_day,main="Length of Stay in ICU",xlab="Length of Stay in ICU",col="grey")

hist(aline$icu_los_day,main="Length of Stay in ICU",xlab="Length of Stay in ICU",col="grey",breaks=100)

hist(log(aline$icu_los_day),main="Length of Stay in ICU",xlab="Length of Stay in ICU",col="grey")

Let’s also take a look at length of stay in the MIMIC-III demo data.

demo.admissions <- dbReadTable(con, "admissions")
demo.los <- as.numeric(demo.admissions$dischtime - demo.admissions$admittime)
hist(demo.los,main="Length of Stay in ICU",xlab="Length of Stay in ICU",col="grey", breaks=50)

Already we’ll notice that the scale is wildly different. In this case, it looks like that’s because demo.los is actually in minutes. Let’s go ahead and change that to the scale of days used previously.

hist(demo.los/(60*24),main="Length of Stay in ICU",xlab="Length of Stay in ICU",col="grey", breaks=50)

Density Estimation

A density esimate of a numeric variable is related to its histogram, as it also tries to characterize the distribution of the variable through computing its density. Without going into too much technical detail, you can think of a density as a scaled version of a “continuous histogram”. We do this by using the density function in R. Like, summary, plot is also a generic function that you can pass to many types of data strucutures, including a density object. Consider the density estimates of the ICU length-of-stay shown below. You will see similar to histograms, density estimates also have a parameter (bw). This controls the smoothness of the estimate. We vary it from 2 to 0.1, and you can see how it affects the estimate of the density.

plot(density(aline$icu_los_day),main="LOS ICU, bw=default",xlab="LOS ICU",ylab="Density Estimate")

plot(density(aline$icu_los_day,bw=2),main="LOS ICU, bw=2",xlab="LOS ICU",ylab="Density Estimate")

plot(density(aline$icu_los_day,bw=.1),main="LOS ICU, bw=0.1",xlab="LOS ICU",ylab="Density Estimate")

Exercise 1:

  1. Plot a histogram of SOFA score. Include an appropriate axis label and title. Make the histogram blue.
  2. Vary the number of bins, include in your report when the number of bins are 3 and 30.
  3. Explain in your own words why the histograms in b) look the way they do, and discuss if R did a good job in picking the number of bins.
  4. What happens when you try to do a histogram for aline_flg?
  5. Plot the density of SOFA score using the default bw setting. Vary the bw parameter, and include when bw=0.1 and 2 in your report.
  6. Comment briefly on which bw setting was the best.
  7. Do you think the distribution of SOFA is bimodal based on your investigation?

Scatterplot

So far, all the plots have only considered one variable at a time. Looking at two or more variables at a time can be done through scatter plots. For instance, the plot below shows the first sodium (x axis) versus the first creatinine (y axis). Again, the plotting functions have dozens of arguments you can pass, here we pass xlab (label of x axis), ylab (label of y axis) and pch, which controls the type of points the plot uses (in our case 20 is solid points)

plot(aline$sodium_first, aline$creatinine_first, pch=20, xlab="First Sodium", ylab="First Creatinine")

Including additional variables can be done carefully. For instance, here we plot the same plot but identify those with renal disease using the color argument, which we pass the renal_flg variable.

plot(aline$sodium_first, aline$creatinine_first, pch=20, col=as.factor(aline$renal_flg))

Exercise 2:

  1. Plot SOFA Score (x-axis) vs age (y-axis). Make sure to add an appropriate label for your axes and title.
  2. Color code those who survived and died using hosp_exp_flg. The default coding will make dead = red and black = survivors.
  3. Can you say anything about the relationship between age, SOFA and hospital mortality? If it’s difficult say why.
  4. Run the jitter function on age and sofa_first, but NOT hosp_exp_flg and replot the data Describe what jitter does? Can you say anything now about the relationship?
  5. Try adding ylim=c(16,90) to your plot function call. What does this do?
  6. Briefly describe the relationship between age, SOFA and in hospital mortality.

Boxplot

When trying to compare numeric variables across different levels of a categorical or factor variable, it’s often useful to use a boxplot. The boxplot function provides an easy way to do this. Boxplots use a useful syntax that will later be used in other types of analyses. Essentially you specify a formula of the form y~x, where y is what you want on the y axis, and x is what you want on the x axis (a categorical variable). Because we are not prefixing the formula with aline$, we pass data=aline to tell the function where to find these variables. For example, below are two boxplots. The first plots creatinine_first by renal_flg. Most of the previous arguments for the generic plot functions will work here as well, and we have included x and y axis labels (xlab and ylab).

boxplot(creatinine_first~renal_flg, data=aline, ylab="First Creatinine", xlab="Renal Disease")

boxplot(map_first ~ service_unit, data=aline, cex.axis=0.6,ylab="MAP", xlab="Service Unit")

In the second plot we have plotted map_first by service_unit to illustrate that boxplots can have multiple levels of the categorical variable – not just two. We add cex.axis=0.6 to make the group labels fit on the axis.

Exercise 3:

  1. Compute a boxplot for SOFA by hosp_exp_flg. Do those who died have higher or lower SOFAs on average?
  2. How many outliers do you see in a)’s plot?
  3. Apply the jitter function to the sofa_first variable.
  4. How many outliers are there now?

Interaction Plot

Interaction plots allow us to visualize when the effect of one categorical variable depends on a second categorical variable. In these plots, parallel lines indicate that there is no interaction; while a greater difference in slope between the lines indicates a higher degree of interaction. These plots are useful for quickly identifying effects, but do not show the corresponding significance of effect. A subsequent ANOVA test can be used to evaluate the statistical significance of any effects that are found. If strong interactions do exist, they must be considered when addressing main effects.

In the following plots we see, 1) An interaction between variables pneumonia_flg and ards_flg when considering temp_first, 2) Several interactions between variables copd_flg and service_unit when considering map_first, and 3) No interaction between variables chf_flg and renal_flg when considering creatine_first.

interaction.plot(aline$pneumonia_flg,aline$ards_flg,aline$temp_first,fun = function(x) mean(x, na.rm = TRUE))

interaction.plot(aline$copd_flg,aline$service_unit,aline$map_first,fun = function(x) mean(x, na.rm = TRUE))

interaction.plot(aline$chf_flg,aline$renal_flg,aline$creatinine_first,fun = function(x) mean(x, na.rm = TRUE))

interaction.plot(aline$chf_flg,aline$renal_flg,aline$creatinine_first,fun = function(x) median(x, na.rm = TRUE))

There is no exercise for this section.


Supervised Visualizations

Supervised data visualization is often used to explicitly test hypotheses about how data are generated or how they relate. Often this requires plotting several visuals on a single plot. For example, in the case of plotting two Gaussians (obtained with dnorm), you can clearly see the separation of means.

x <- seq(0, 10, 0.01)
plot(x, dnorm(x, 3, 1), type="l")   # mean = 3
lines(x, dnorm(x, 7, 1), col="red") # mean = 7

Hypothesis Testing

Hypothesis testing examines the probability that a pattern might have arisen by chance. A statistical hypothesis test assesses the likelihood of the null hypothesis. For example, what is the probability of sampling the observed data assuming the population means are equal? (Null Hypothesis, Alternate Hypothesis)

In this process, we often compute a test statistic. This is a number that in essence summarizes the difference. The possible values of this statistic come from a known probability distribution. According to this distribution, we determine the probability of seeing a value meeting or exceeding the test statistic, which is called a p-value. For example, \[Z = \frac{\mu_m - \mu_f}{\sqrt{\sigma^2_m / N_m + \sigma^2_f / N_f}}\].

We also need to choose a threshold at which we consider it safe (or reasonable?) to reject the null hypothesis. If \(p < 0.05\), we typically say that the observed effect or difference is statistically significant. This means that there is a less than 5% chance thatthe observed data is due to chance. Note that the choice of 0.05 is a somewhat arbitrary threshold (chosen by R. A. Fisher).

Common Statistical Methods

For testing particular relationships, the following tests are often used:

Question Data Type Parametric Non-Parametric
Do data distributions have different “centers”? 2 uni. dists t-Test Mann-Whitney U
> 2 uni. dists ANOVA Kruskal-Wallis (aka “location” tests)
> 2 multi. dists MANOVA Median Test
Are observed counts significantly different? Counts in categories Chi-squared
Are two vars related? 2 variables Pearson coeff. Rank correl.
Do 1 (or more) variables predict another? Continuous Linear regression
Binary Logistic regression

Exercise 4

  1. Count the number of men and women in the dataset, and find the mean and standard deviation for the length of stay in each population.
  2. Do you think that the difference in the length of stay between genders is significant?
  3. Plot two Guassian curve representing these two groups on the same (shared) plot.
  4. Compute the teset statistic between the two populations.
  5. Is the difference in length of stay statistically significant? In other words: assuming no true difference, what is the probability that our data is due to chance?

Prediction/Model-Driven Data Validation

Another common check during modeling is to examine how well one (or more) data variables predict values of interest. In this setting, we may apply data transformations, check for model predictions, and compute residuals. We first want to propose a model to fit our data, for example age and length of stay. We can then visualize how well the curve fits the data in three ways.

  • Plot a Quantile-Quantile plot to examine the fit of the two variables.
  • Plot a curve to fit the data to show the general fit of the family (model in data space).
  • Plot residual graph (vertical distance from best fit curve) to show accuracy of fit (data in model space).

Exercise 5

  1. Plot the relationship between age and length of stay in a QQ plot and separately in a scatterplot. Does the QQ plot reveal anything the scatter plot does not?
  2. Fit a quadratic curve to the points, show this line amongst the actual points in the graph. This is a visualization of the model in data space.
  3. Compute the residuals for each point, and compare the error across various values of age. What do you notice?

Summarization

Another important use for visualizations is as a first step of summarization. By plotting data relationships, we can examine what parameters best fit our data to a given function, and what is the goodness of fit of that function in general. Visualizations can highlight problems with models, e.g. over and under fitting to a particular trend. For this, we estimate non-parametric regression in R with lowess - short for locally weighted scatterplot smoothing. Lowess is a special case of outlier resistant non-parametric regression, where we draw a smooth curve to summarize a relationship between the plotted variables using both a local polynomial least squares fit and an adjusted final fit.

Exercise 6

  1. Generate 100 independent Gaussian random variables for x and y with zero mean and unit variance, and visualize them with a scatterplot.
  2. Generate the lowess fit for the data with a low smoother span (f). Note that the smoother span gives the proportion of points in the plot which influence the smooth at each value.
  3. Generate the lowess fit for the data with a high smoother span (f)
  4. What could go wrong with both smoother sapns specified to summarize a real timeseries?

Other Considerations

Subtle choices in the visualization of data can greatly affect interpretation. Aspects such as color, size, spacing, binning, labels, and many others have strong impacts on how we perceive and understand visual information. While these topics are indeed the subject of entire lectures on their own, below are a few considerations.

Improper Graph Choice

There are some instances where the nature of the data makes a particular graph misleading. For example, we can use the rpois function to simulate any number of independent Poisson random variables with parameter \(\lambda\). We could then visualize this vector of values, \(V\), in several ways to get an estimate of the probability distribution \(P(V = v)\).

Scaling and Extraction

Extraction of a particular span of time in data can be useful when looking for outliers, or investigating a pattern, but these extracted graphs should be representative of the original data. This can be particularity bad when extraction creates truncated axis labels. For example, showing a much smaller portion of the vertical axis can make small differences look big; extracting a smaller portion of the horizontal axis can also make small changes look larger.

Color

Generally, there are a few good rules of thumb to consider when choosing colors. - Use only a few colors (\(< 6\) ideally). - Colors should be distinctive and clear to all audiences, including the color blind. - Strive for color harmony. - Use cultural conventions and appreciate symbolism. - Get it right in black and white. - Take advantage of perceptual color spaces.

Exercise 7

  1. Generate 1000 indepedent Poisson random variables with \(\lambda = 1\).
  2. Plot an estimate of the probability distribution \(P(V=v)\) using density.
  3. Plot an estimate of the probability distribution \(P(V=v)\) using barplot.
  4. Is the density plot or bar plot more appropriate? Why?
  5. Create a plot from data of your choice that is misleading due to color, scale, or any other means.

# close the connection
dbDisconnect(con)
## [1] TRUE
dbUnloadDriver(drv)
## [1] TRUE